{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f80b8618-abf6-4763-89d9-20b831c4ea98",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Hippocorpus converter"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b545caa0-0a32-4007-8f17-8fbdc2f1dd37",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Import"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aefa8aac-4ab9-4b5a-b3e0-c65baa8da873",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the Hippocorpus stories table; the path is relative to the current working directory.\n",
    "hippocorpus = pd.read_csv(\"hippocorpus/hcV3-stories.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ebf19352-7c90-4bdf-bc90-5e328e64161d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A noticeable share of participants appear to have typed with caps lock switched on.\n",
    "# Those rows tend to be below average in quality, and restoring proper casing would need more\n",
    "# complex methods, so every row whose mainEvent is entirely upper-case is dropped.\n",
    "written_in_caps = hippocorpus[\"mainEvent\"].str.isupper()\n",
    "hippocorpus = hippocorpus.loc[~written_in_caps]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "968bbf1e-78b3-4436-8a3f-9335b4d2801a",
   "metadata": {},
   "source": [
    "## Convert"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec73fd44-b209-4bea-a1c9-711251747647",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from random import choice, random, randrange\n",
    "import nltk\n",
    "from nltk.tokenize import sent_tokenize\n",
    "\n",
    "# sent_tokenize needs the Punkt sentence-tokenizer data. Newer NLTK releases load it from the\n",
    "# \"punkt_tab\" resource while older ones use \"punkt\", so request both; a release that does not\n",
    "# know one of the names only reports it as unavailable.\n",
    "for punkt_resource in (\"punkt\", \"punkt_tab\"):\n",
    "    nltk.download(punkt_resource)\n",
    "\n",
    "\n",
    "def replace_my(string):\n",
    "    match = re.search(r\"my (\\w+)\", string)\n",
    "    if match:\n",
    "        word = match.group(1)\n",
    "        if word[0] in \"aeiou\":\n",
    "            string = re.sub(r\"my\", \"an\", string, 1)\n",
    "        else:\n",
    "            string = re.sub(r\"my\", \"a\", string, 1)\n",
    "    return string\n",
    "\n",
    "\n",
    "def sure():\n",
    "    ack = choice([\"Sure\", \"Of course\", \"Alright\", \"Certainly\"])\n",
    "    punctuation = choice([\",\", \"!\", \".\"])\n",
    "    return ack + punctuation\n",
    "\n",
    "\n",
    "def _as_clause(text):\n",
    "    \"\"\"Prepare free text for embedding in the middle of a sentence.\n",
    "\n",
    "    Surrounding whitespace and trailing sentence punctuation are removed, and the first\n",
    "    character is lower-cased unless the text opens with the pronoun \"I\" (including\n",
    "    contractions such as \"I'm\"), which has to stay capitalized. Empty text is returned as is.\n",
    "    \"\"\"\n",
    "    text = text.strip().rstrip(\"!.?;:\").rstrip()\n",
    "    # \"I\", \"I'm\", \"I've\", ... keep their capital; every other opening gets lower-cased.\n",
    "    if not text or re.match(r\"I\\b\", text):\n",
    "        return text\n",
    "    return text[0].lower() + text[1:]\n",
    "\n",
    "\n",
    "def convert_row(row):\n",
    "    \"\"\"Turn one Hippocorpus row into a multi-turn \"User:\" / \"Rosey:\" interaction.\n",
    "\n",
    "    The interaction opens with a request for a story about the row's main event (half of the\n",
    "    time also asking for one of the story's sentences to be used) and Rosey's reply containing\n",
    "    the story. It is followed, in random order, by a question about the most surprising part\n",
    "    of the story and a request for a summary.\n",
    "\n",
    "    Args:\n",
    "        row: Mapping-like row providing the \"mainEvent\", \"story\", \"mostSurprising\" and\n",
    "            \"summary\" fields; the first three are used as strings.\n",
    "\n",
    "    Returns:\n",
    "        The whole interaction as one string, with turns separated by blank lines.\n",
    "    \"\"\"\n",
    "    interaction = \"\"\n",
    "    main_event = replace_my(_as_clause(row[\"mainEvent\"]))\n",
    "    an_original = choice([\"a\", \"an original\"])\n",
    "    write = choice([\"Write\", \"Write me\", \"Please write\"])\n",
    "    instruction = f\"{write} {an_original} story about {main_event}.\"\n",
    "    interaction += f\"User: {instruction}\"\n",
    "\n",
    "    story = row[\"story\"]\n",
    "    # Half of the time, additionally ask for one particular sentence of the story.\n",
    "    sentences = sent_tokenize(story) if random() > 0.5 else []\n",
    "    sentence_response_section = \"\"\n",
    "    # No sentences (request not drawn, or nothing to tokenize) means no sentence request;\n",
    "    # this also keeps randrange from being called with 0, which raises ValueError.\n",
    "    if sentences:\n",
    "        sentence_index = randrange(len(sentences))\n",
    "        if sentence_index == 0:\n",
    "            interaction += \" Make the first sentence \"\n",
    "            sentence_response_section = \" where the first sentence is \"\n",
    "        elif sentence_index == len(sentences) - 1:\n",
    "            interaction += \" Make the last sentence \"\n",
    "            sentence_response_section = \" where the last sentence is \"\n",
    "        else:\n",
    "            interaction += \" Include the sentence \"\n",
    "            sentence_response_section = \" which includes the sentence \"\n",
    "        quoted_sentence = f'\"{sentences[sentence_index]}\"'\n",
    "        interaction += quoted_sentence\n",
    "        sentence_response_section += quoted_sentence\n",
    "    interaction += \"\\n\\n\"\n",
    "\n",
    "    interaction += f\"Rosey: {sure()} Here's a story about {main_event}{sentence_response_section}.\\n\\n{story}\"\n",
    "    interaction += \"\\n\\n\"\n",
    "\n",
    "    def most_surprising(interaction):\n",
    "        \"\"\"Append the question/answer exchange about the story's most surprising part.\"\"\"\n",
    "        # Trailing punctuation is dropped because the answer template adds its own period.\n",
    "        surprise = _as_clause(row[\"mostSurprising\"])\n",
    "        was = choice([\"was\", \"do you think was\", \"would you say was\", \"do you think someone would say was\"])\n",
    "        surprising = choice(\n",
    "            [\"the most surprising thing\", \"one of the most surprising things\", \"a surprising development\"]\n",
    "        )\n",
    "        interaction += f\"User: What {was} {surprising} in that story?\\n\\n\"\n",
    "        id_say = choice([\"I'd say the\", \"I would have to say the\", \"The\", \"This story's\"])\n",
    "        interaction += f\"Rosey: {id_say} most surprising development was {surprise}.\"\n",
    "        return interaction\n",
    "\n",
    "    def summarize(interaction):\n",
    "        \"\"\"Append the exchange in which the user asks for, and receives, the row's summary.\"\"\"\n",
    "        preamble = choice(\n",
    "            [\"The story is a little long. \", \"This is longer than I was expecting. \", \"It needs to be shorter. \", \"\"]\n",
    "        )\n",
    "        verb = choice(\n",
    "            [\"shorten it to a sentence or two\", \"summarize it\", \"shrink it way down\", \"make it way more terse\"]\n",
    "        )\n",
    "        request = choice([\"Can you \", \"I need you to \", \"Please \"])\n",
    "        interaction += f\"User: {preamble}{request}{verb}.\\n\\n\"\n",
    "        interaction += f\"Rosey: {sure()} Here's a summary of the story:\\n\\n{row['summary']}\"\n",
    "        return interaction\n",
    "\n",
    "    # Ask the two follow-up questions in a random order.\n",
    "    (first, second) = (most_surprising, summarize)\n",
    "    if random() > 0.5:\n",
    "        (first, second) = (second, first)\n",
    "    interaction = first(interaction)\n",
    "    interaction += \"\\n\\n\"\n",
    "    interaction = second(interaction)\n",
    "\n",
    "    return interaction\n",
    "\n",
    "\n",
    "# Build one interaction string per row. This rebinds `hippocorpus` to the per-row results of\n",
    "# convert_row, so the source table is no longer reachable under this name afterwards.\n",
    "hippocorpus = hippocorpus.apply(convert_row, axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d6fadffd-f0b0-44f9-abf5-41fad3c26738",
   "metadata": {},
   "source": [
    "## Export"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b0d047d-8cb4-4621-80bc-630545c1c309",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the converted interactions to hippocorpus.csv, a path relative to the working directory.\n",
    "hippocorpus.to_csv(\"hippocorpus.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Open-Assistant",
   "language": "python",
   "name": "open-assistant"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
